# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
import time
# magic word for producing visualizations in notebook
%matplotlib inline
from IPython.core.display import HTML
from jupyterthemes import jtplot
jtplot.style('grade3')
%config InlineBackend.figure_format = 'retina'
# Load in the general demographics data.
azdias = pd.read_csv('Udacity_AZDIAS_Subset.csv', delimiter=';')
# Load in the feature summary file.
feat_info = pd.read_csv('AZDIAS_Feature_Summary.csv', delimiter=';')
# Check the structure of the data after it's loaded (e.g. print the number of
# rows and columns, print the first few rows).
print('Population raw data shape: {}'.format(azdias.shape))
# create copies of data for later comparison
# NOTE: copies are taken before any cleaning so the raw values can later be
# compared against the NaN-encoded versions further down the notebook.
data_raw = azdias.copy()
features_raw = feat_info.copy()
# helper function
def process_x(string):
    '''
    Parse a "missing_or_unknown" code string such as '[-1,0]' or '[-1,X]'.

    INPUT:
    string: str of the form '[a,b,...]' listing missing/unknown codes
    OUTPUT:
    list of codes with numeric tokens converted to int, or the original
    string unchanged when the brackets contain nothing (e.g. '[]')
    '''
    # strip the surrounding brackets, split on commas, trim whitespace
    tokens = [tok.strip() for tok in string.strip('[]').split(',')]
    # convert numeric tokens (including negatives like '-1') from str to int
    result = [int(tok) if tok.strip('-').isdigit() else tok for tok in tokens]
    if result == ['']:
        # empty code list: fall back to the original string so downstream
        # length checks can detect "nothing to convert"
        # (the original had a redundant if/else where both branches returned
        # the same variable; this is the single intended special case)
        return string
    return result
# convert missing codes from strings to lists
feat_info['missing_or_unknown'] = feat_info['missing_or_unknown'].apply(
    process_x)
# Identify missing or unknown data values and convert them to NaNs.
# NOTE: the original inner loop re-ran the identical full-list replace once
# per code in the list; a single replace per column yields the same result.
# (The original `pass` on the empty case was also a no-op, not a skip.)
for i in range(len(feat_info)):  # iterate through all columns
    unknown_list = feat_info['missing_or_unknown'].iloc[i]
    if len(unknown_list) == 0:
        continue  # nothing to convert for this column
    # replace every missing/unknown code in this column with np.nan
    azdias[feat_info['attribute'][i]].replace(
        unknown_list, np.nan, inplace=True)
# compares cleaned data with raw data
print('NaN ENCODED DATA')
print('*' * 50)
print(azdias['ALTERSKATEGORIE_GROB'].unique())
print(azdias['CAMEO_INTL_2015'].unique())
print('RAW ENCODED DATA')
print('*' * 50)
print(data_raw['ALTERSKATEGORIE_GROB'].unique())
print(data_raw['CAMEO_INTL_2015'].unique())
# Perform an assessment of how much missing data there is in each column of the
# dataset.
missing = pd.DataFrame(azdias.isnull().sum()) # sum of missing data
missing.columns = ['sum NaN'] # new column title
# create new column with % of column is NaN
missing['% NaN'] = ((missing['sum NaN'] / len(azdias)) * 100)
# sort descending
missing.sort_values('% NaN', ascending=False, inplace=True)
# merge with feat_info DF to look at data type info level trends
# NOTE: merging the index against feat_info['attribute'] makes pandas emit an
# automatic 'key_0' join column holding the attribute name; it becomes the
# index just below.
missing = pd.merge(
    missing,
    feat_info[['information_level', 'type']],
    left_on=missing.index,
    right_on=feat_info['attribute'])
missing.set_index('key_0', inplace=True, drop=True)
# EASIER THAN HISTOGRAM TO DETERMINE OUTLIERS
print('% NaN BY COLUMN')
display(missing.head(10))
# look at all rows without hiding middle rows
# NOTE(review): 'key_0' was moved to the index (drop=True) above; sorting by
# it as a column may depend on the pandas version — confirm this runs.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(missing.sort_values('key_0'))
# Investigate patterns in the amount of missing data in each column.
# correlation matrix of % NaN and information_level
# one-hot encode the information_level strings, then correlate each indicator
# column with the % NaN column
info_level_corr = missing.information_level.str.get_dummies().corrwith(
    missing['% NaN']).sort_values(ascending=False)
print('INFO LEVEL CORRELATION', '\n')
print('*' * 30)
print(info_level_corr, '\n')
# correlation matrix of % NaN and data type
data_type_corr = missing.type.str.get_dummies().corrwith(
    missing['% NaN']).sort_values(ascending=False)
print('DATA TYPE CORRELATION', '\n')
print('*' * 30)
print(data_type_corr)
# Remove the outlier columns from the dataset. (You'll perform other data
# engineering tasks such as re-encoding and imputation later.)
# remove 6 top features with highest NaN percentages
# ('missing' is sorted by % NaN descending, so the first six index labels are
# the six worst columns)
azdias.drop(missing.index[:6], axis=1, inplace=True)
# Dropped columns: TITEL_KZ, AGER_TYP, KK_KUNDENTYP, KBA05_BAUMAX,
# GEBURTSJAHR, ALTER_HH
# Visualise how many NaNs each row of the demographics data carries.
row_nan_counts = azdias.isnull().sum(axis=1).sort_values(ascending=False)
plot_palette = sns.cubehelix_palette(8, start=.5, rot=-.75)
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(row_nan_counts, ax=ax, palette=plot_palette)
ax.set_xlabel('NaN in Row', fontdict={'fontsize': 20})
ax.set_ylabel('Frequency', fontdict={'fontsize': 20})
ax.set_title('Distribution of Missing Data by Row', fontdict={'fontsize': 25})
fig.tight_layout()
# Write code to divide the data into two subsets based on the number of missing
# values in each row.
# add a column to DF with sum of NaN values in row
azdias['row NaN'] = azdias.isnull().sum(axis=1)
# create subset of data where sum of NaNs per row is less than or equal to 10
# NOTE(review): despite the "perc" names, the split is on a raw count of 10
# NaNs per row, not 10 percent — confirm intent.
fewer_ten_perc_NaN = azdias.loc[azdias['row NaN'] <= 10]
# create subset of data where sum of NaNs per row is greater than 10
more_ten_perc_NaN = azdias.loc[azdias['row NaN'] > 10]
print('Shape of DF with less than 10 NaNs per row: {}'.format(
    fewer_ten_perc_NaN.shape))
# % of original data
print('{:.2%} of original data'.format(
    fewer_ten_perc_NaN.shape[0] / len(azdias)))
print('*' * 75)
print('Shape of DF with more than 10 NaNs per row: {}'.format(
    more_ten_perc_NaN.shape))
# % of original data
print('{:.2%} of original data'.format(
    more_ten_perc_NaN.shape[0] / len(azdias)))
print('*' * 75)
# make sure that sum of subsets' lengths equals sum of original data
print(
    'Length of two data frames added together equals len of original data? \n{}'.
    format((len(fewer_ten_perc_NaN) + len(more_ten_perc_NaN)) == len(azdias)))
# Compare the distribution of values for at least five columns where there are
# no or few missing values, between the two subsets.
def col_compare(less_NaN_df, more_NaN_df, col):
    '''
    Plot side-by-side count distributions of one column in the two subsets.

    INPUT:
    less_NaN_df: DataFrame with exactly same columns as other DF
    more_NaN_df: DataFrame with exactly same columns as other DF
    col: str of column label wished to be compared
    OUTPUT:
    Two plots
    '''
    shared_palette = sns.cubehelix_palette(8, start=.5, rot=-.75)
    figure, (low_ax, high_ax) = plt.subplots(1, 2, figsize=(8, 3))
    figure.dpi = 85  # set resolution
    low_ax.set_title('<= 10% NaN')
    high_ax.set_title('> 10% NaN')
    # low-NaN subset on the left, high-NaN subset on the right
    sns.countplot(x=col, data=less_NaN_df, ax=low_ax, palette=shared_palette)
    sns.countplot(x=col, data=more_NaN_df, ax=high_ax, palette=shared_palette)
    figure.tight_layout()
# random columns
rand_columns = azdias.columns.to_series().sample(10)
# plot a bunch (10) of data distributions between two subsets
# NOTE: iterate the sampled values directly — integer positional indexing
# into a label-indexed Series (rand_columns[i]) is deprecated and fragile.
for col in rand_columns:
    col_compare(fewer_ten_perc_NaN, more_ten_perc_NaN, col)
# Question: are the data with lots of missing values qualitatively different
# from data with few or no missing values?
# How many features are there of each data type?
# count of data types
print('VALUE COUNTS OF DATA TYPES')
print('*' * 30)
# group the column-summary frame by data type and count the entries per type
display(
    pd.DataFrame(
        missing.groupby('type').count()['information_level'].sort_values(
            ascending=False)))
# Assess categorical variables: which are binary, which are multi-level, and
# which one needs to be re-encoded?
# create subset of missing dataframe for only columns with the data type of categorical
n_unique = fewer_ten_perc_NaN.nunique()
categorical = missing.loc[missing['type'] == 'categorical']
# merge n_unique df created above
# NOTE: index-vs-index merge again creates an automatic 'key_0' column that
# holds the attribute names; it is referenced below and in later cells.
categorical = pd.merge(
    categorical,
    n_unique.to_frame(),
    left_on=categorical.index,
    right_on=n_unique.index)
# rename merged column as it came from an unnamed series
categorical.rename(
    columns={
        0: 'n_unique',
    }, inplace=True)
# create new column based off of n_unique, showing binary/multi
# (two or fewer distinct values -> binary, otherwise multi-level)
categorical['sub_type'] = categorical['n_unique'].apply(
    lambda x: 'binary' if x <= 2 else 'multi')
# sort by sub_type
print('CATEGORICAL DATA SORTED BY SUB_TYPE')
display(categorical.sort_values('n_unique'))
print('DATA TYPES OF CATEGORICAL DATA')
display(fewer_ten_perc_NaN[categorical.key_0].dtypes)
print('Three columns are "objects" and need to be taken care of')
# Re-encode categorical variable(s) to be kept in the analysis.
cleaned = fewer_ten_perc_NaN.copy()
# re-encode the 'O' and 'W' values to binary representations
# also removes original column from analysis
cleaned['OST_WEST_KZ'] = cleaned['OST_WEST_KZ'].replace({'O': 0, 'W': 1})
cleaned['OST_WEST_KZ'].unique()
# remove all multi categorical columns from analysis for ease of analysis
# ('categorical' was built above; its key_0 column holds the column names)
cleaned.drop(columns=categorical.loc[categorical['sub_type'] == 'multi'].key_0, inplace=True)
# now has 13 less columns
print('Before dropped columns:', fewer_ten_perc_NaN.shape)
print('After dropped columns:', cleaned.shape)
# CAMEO_DEU_2015 would require too many dummy variables to be worth encoding,
# so it is best removed from the analysis.
# Investigate "PRAEGENDE_JUGENDJAHRE" and engineer two new variables.
# share of NaNs in PRAEGENDE_JUGENDJAHRE before the re-engineering below
print('{:.2%} NaN BEFORE RE-ENGINEERING'.format(
    cleaned['PRAEGENDE_JUGENDJAHRE'].isnull().sum() / len(
        cleaned['PRAEGENDE_JUGENDJAHRE'])))
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Distribution of PRAEGENDE_JUGENDJAHRE values')
sns.countplot(
    data=cleaned,
    x='PRAEGENDE_JUGENDJAHRE',
    ax=ax,
    palette=sns.cubehelix_palette(8, start=.5, rot=-.75))
fig.tight_layout()
print('MAJORITY OF VALUES ARE 14, WHICH ARE MAINSTREAM DIGITAL MEDIA KIDS')
# PRAEGENDE_JUGENDJAHRE encodes the dominating movement of a person's youth
# (avantgarde vs. mainstream; east vs. west).
# We split it into two new variables: DECADE (via decades_map below) and
# MOVEMENT (via movements_map below).
# maps the existing PRAEGENDE_JUGENDJAHRE codes onto six decade buckets,
# creating the DECADE variable (same mapping as the original literal dict)
_decade_groups = {
    1: (1, 2),
    2: (3, 4),
    3: (5, 6, 7),
    4: (8, 9),
    5: (10, 11, 12, 13),
    6: (14, 15),
}
decades_map = {
    code: decade
    for decade, codes in _decade_groups.items() for code in codes
}
# maps the existing PRAEGENDE_JUGENDJAHRE codes to the MOVEMENT variable
# (1: mainstream, 2: avantgarde — see the engineered-variable notes below)
_mainstream_codes = {1, 3, 5, 8, 10, 12, 14}
movements_map = {
    code: 1 if code in _mainstream_codes else 2
    for code in range(1, 16)
}
# apply and replace the old values with the mapped dictionaries above
cleaned['DECADE'] = cleaned.PRAEGENDE_JUGENDJAHRE.map(decades_map)
cleaned['MOVEMENT'] = cleaned.PRAEGENDE_JUGENDJAHRE.map(movements_map)
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title(
    'Distribution of PRAEGENDE_JUGENDJAHRE Encoded DECADE Values',
    fontdict={'fontsize': 20})
sns.countplot(
    data=cleaned,
    x='DECADE',
    ax=ax,
    palette=sns.cubehelix_palette(8, start=.5, rot=-.75))
fig.tight_layout()
# the NaN share is unchanged by the mapping (map leaves NaN as NaN)
print('{:.2%} NaN DECADE AFTER RE-ENGINEERING'.format(
    cleaned['DECADE'].isnull().sum() / len(cleaned['DECADE'])))
# same as before, as to be expected
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title(
    'Distribution of PRAEGENDE_JUGENDJAHRE Encoded MOVEMENT Values',
    fontdict={'fontsize': 20})
sns.countplot(
    data=cleaned,
    x='MOVEMENT',
    ax=ax,
    palette=sns.cubehelix_palette(8, start=.5, rot=-.75))
fig.tight_layout()
print('{:.2%} NaN MOVEMENT AFTER RE-ENGINEERING'.format(
    cleaned['MOVEMENT'].isnull().sum() / len(cleaned['MOVEMENT'])))
# same as before, as to be expected
# Investigate "CAMEO_INTL_2015" and engineer two new variables.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title(
    'Distribution of CAMEO_INTL_2015 Values', fontdict={'fontsize': 20})
sns.countplot(
    data=cleaned.sort_values('CAMEO_INTL_2015'),
    x='CAMEO_INTL_2015',
    ax=ax,
    palette=sns.cubehelix_palette(8, start=.5, rot=-.75))
# PERCENT OF MISSING VALUES FROM COLUMN
print('{:.2%} NaN BEFORE RE-ENGINEERING'.format(
    cleaned['CAMEO_INTL_2015'].isnull().sum() / len(
        cleaned['CAMEO_INTL_2015'])))
# CAMEO_INTL_2015 is the German CAMEO wealth / life-stage typology mapped to
# the international code; its two characters are split out below.
# CREATING NEW VARIABLES, EXTRACTING FIRST AND SECOND CHARACTERS AND MAPPING THEM TO A NEW COLUMN
# NOTE(review): assumes non-null CAMEO_INTL_2015 values stringify with the
# wealth digit first and the life-stage digit second (e.g. '51') — confirm.
cleaned['WEALTH'] = cleaned['CAMEO_INTL_2015'].apply(
    lambda x: int(str(x)[0]) if not pd.isna(x) else None)
cleaned['LIFE_STAGE'] = cleaned['CAMEO_INTL_2015'].apply(
    lambda x: int(str(x)[1]) if not pd.isna(x) else None)
# NEW SEPARATED VARIABLES
cleaned[['CAMEO_INTL_2015', 'WEALTH', 'LIFE_STAGE']].sample(10)
# distribution of the new WEALTH variable
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Distribution of WEALTH Values', fontdict={'fontsize': 20})
sns.countplot(data=cleaned, x='WEALTH', ax=ax,
              palette=sns.cubehelix_palette(8, start=.5, rot=-.75));
print('{:.2%} NaN WEALTH AFTER RE-ENGINEERING'.format(
    cleaned['WEALTH'].isnull().sum() / len(cleaned['WEALTH'])))
# SAME PERCENT AS BEFORE, WHICH IS EXPECTED
# distribution of the new LIFE_STAGE variable
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Distribution of LIFE_STAGE Values', fontdict={'fontsize': 20})
sns.countplot(
    data=cleaned,
    x='LIFE_STAGE',
    ax=ax,
    palette=sns.cubehelix_palette(8, start=.5, rot=-.75))
fig.tight_layout()
print('{:.2%} NaN LIFE_STAGE AFTER RE-ENGINEERING'.format(
    cleaned['LIFE_STAGE'].isnull().sum() / len(cleaned['LIFE_STAGE'])))
# SAME PERCENT AS BEFORE, WHICH IS EXPECTED
# drop columns that are multi-level that aren't being included in the analysis
mixed = feat_info[feat_info['type'] == 'mixed']
# NOTE(review): index 64 is presumably the high-NaN mixed column already
# dropped from the data earlier — confirm it still lines up with feat_info.
mixed = mixed.drop(
    index=64)  # remove high NaN column removed from actual data earlier
# nunique of the mixed columns (computed for inspection; not used below)
mixed_unique = cleaned[mixed.attribute].nunique()
# remove columns that are no longer needed
cleaned = cleaned.drop('row NaN', axis=1)
cleaned = cleaned.drop(mixed.attribute, axis=1)
print('CLEANED DATA PREVIEW')
display(cleaned.sample(5))
# Raw mixed-type columns removed: PRAEGENDE_JUGENDJAHRE, CAMEO_INTL_2015,
# LP_LEBENSPHASE_FEIN, LP_LEBENSPHASE_GROB, WOHNLAGE, PLZ8_BAUMAX.
# Engineered variables kept in their place:
#   DECADE: the individual's decade associated with their youth (1 - 6)
#   MOVEMENT: movement aligned with in youth (1: Mainstream, 2: Avantgarde)
#   WEALTH: 1 to 5 indicating household wealth (1 is wealthy, 5 poorer)
#   LIFE_STAGE: 1 to 5 indicating life stage (1 is young, 5 is elderly)
print('ALL NUMERIC DATA TYPES! Ready for analysis')
print('*' * 50)
# info() lists the dtypes — verifies the remaining columns are numeric
display(cleaned.info())
# helper functions for cleaning
def missing_to_NaN(df):
    '''
    Replace each column's missing/unknown codes with np.nan.

    INPUT: DataFrame (modified in place and returned)
    OUTPUT: DataFrame with unknown value codes converted to NaNs

    NOTE: relies on the module-level feat_info table, whose
    'missing_or_unknown' column has already been parsed by process_x.
    '''
    for i in range(len(feat_info)):  # iterate through all columns
        unknown_list = feat_info['missing_or_unknown'].iloc[i]
        if len(unknown_list) == 0:
            continue  # nothing to convert for this column
        # one replace per column; the original re-ran the identical
        # full-list replace once per code in the list, which was redundant
        df[feat_info['attribute'][i]].replace(
            unknown_list, np.nan, inplace=True)
    return df
def split_based_on_NaN(df, percent_split=10):
    '''
    Split a DataFrame into low-NaN and high-NaN row subsets.

    INPUT:
    - DataFrame (gains a 'row_NaN' count column as a side effect)
    - Integer of percent threshold to split DataFrame by (default 10)
    OUTPUT:
    - Low NaN DataFrame, High NaN DataFrame
    '''
    # count NaNs per row and keep the count as a column on the frame
    df['row_NaN'] = df.isnull().sum(axis=1)
    # rows at or below the threshold form the low-NaN subset, the rest
    # form the high-NaN subset
    below_threshold = df['row_NaN'] <= percent_split
    df_lowNaN = df.loc[below_threshold]
    df_highNaN = df.loc[~below_threshold]
    return df_lowNaN, df_highNaN
def encode_and_remove_categ(df):
    '''
    INPUT:
    - DataFrame
    OUTPUT:
    - DataFrame with binary values numerically
      encoded and multi-level columns removed

    NOTE: reads the module-level `categorical` summary frame to decide
    which multi-level columns to drop.
    '''
    # OST_WEST_KZ is the non-numeric binary column: O -> 0, W -> 1
    df['OST_WEST_KZ'] = df['OST_WEST_KZ'].replace({'O': 0, 'W': 1})
    # drop every categorical feature flagged as multi-level
    multi_level_cols = categorical.loc[categorical['sub_type'] == 'multi'].key_0
    df = df.drop(columns=multi_level_cols)
    return df
def create_mixed_features(df):
    '''
    INPUT:
    - DataFrame
    OUTPUT:
    - DataFrame with new variables created from mixed features, removes old features

    Engineers DECADE/MOVEMENT from PRAEGENDE_JUGENDJAHRE and
    WEALTH/LIFE_STAGE from CAMEO_INTL_2015, then drops the raw mixed-type
    columns (from the module-level feat_info table) and the 'row_NaN'
    helper column.
    '''
    # PRAEGENDE_JUGENDJAHRE code -> decade bucket (1-6); built from grouped
    # tuples instead of the original repeated literal entries
    decade_groups = {
        1: (1, 2),
        2: (3, 4),
        3: (5, 6, 7),
        4: (8, 9),
        5: (10, 11, 12, 13),
        6: (14, 15),
    }
    decades_map = {
        code: decade
        for decade, codes in decade_groups.items() for code in codes
    }
    # PRAEGENDE_JUGENDJAHRE code -> movement (1: mainstream, 2: avantgarde)
    mainstream_codes = {1, 3, 5, 8, 10, 12, 14}
    movements_map = {
        code: 1 if code in mainstream_codes else 2
        for code in range(1, 16)
    }
    df['DECADE'] = df.PRAEGENDE_JUGENDJAHRE.map(decades_map)
    df['MOVEMENT'] = df.PRAEGENDE_JUGENDJAHRE.map(movements_map)
    # CAMEO_INTL_2015: first character is wealth, second is life stage
    df['WEALTH'] = df['CAMEO_INTL_2015'].apply(
        lambda x: int(str(x)[0]) if not pd.isna(x) else None)
    df['LIFE_STAGE'] = df['CAMEO_INTL_2015'].apply(
        lambda x: int(str(x)[1]) if not pd.isna(x) else None)
    mixed = feat_info[feat_info['type'] == 'mixed']
    # remove high NaN column removed from actual data earlier
    mixed = mixed.drop(index=64)
    # (the original also computed an unused nunique() over the mixed
    # columns; that dead code has been removed)
    df = df.drop('row_NaN', axis=1)
    df = df.drop(mixed.attribute, axis=1)
    return df
def clean_data(df):
    """
    Perform feature trimming, re-encoding, and engineering for demographics
    data

    INPUT:
    - Demographics DataFrame
    OUTPUT:
    - Trimmed and cleaned demographics DataFrame, high NaN DataFrame

    NOTE: depends on module-level state built earlier in the notebook —
    feat_info (via missing_to_NaN / create_mixed_features), `missing` for
    the outlier column list, and `categorical` (via encode_and_remove_categ).
    """
    print('=> encoding missing values as NaN')
    df = missing_to_NaN(df)
    # remove selected columns and rows, ...
    print('=> removing columns selected')
    columns_to_remove = missing.index[:6] # same columns as above
    df = df.drop(columns_to_remove, axis=1)
    # split into two datasets, one with high NaNs and one with low NaNs
    print(
        '=> splitting into two datasets, one with high NaNs and one with low NaNs'
    )
    df, df_highNaN = split_based_on_NaN(df)
    # select, re-encode, and engineer column values.
    print(
        '=> re-encoding binary values and removing multi-level categorical columns'
    )
    df = encode_and_remove_categ(df)
    print(
        '=> creating mixed features and removing other mixed feature columns')
    df = create_mixed_features(df)
    # Return the cleaned dataframes
    print('=> done!')
    return df, df_highNaN
# run the full pipeline on the raw copy and compare against the notebook's
# incrementally built 'cleaned' frame
test, test_high = clean_data(data_raw)
print('TEST HIGH NaN')
print(test_high['ALTERSKATEGORIE_GROB'].sort_values().unique())
print(test_high.shape)
print('--------------------------')
print('TEST')
print(test['ALTERSKATEGORIE_GROB'].sort_values().unique())
print(test.shape)
print('--------------------------')
print('CONTROL DATA')
print(cleaned['ALTERSKATEGORIE_GROB'].sort_values().unique())
print(cleaned.shape)
# percent of nans remaining in cleaned data, per top 15 columns
(cleaned.isnull().mean(axis=0) * 100).sort_values(ascending=False).head(15)
# W_KEIT_KIND_HH has the highest share of missing values per column, with two
# others not far behind.
# proportion of NaNs for each row
row_nans = (cleaned.isnull().mean(axis=1) * 100).sort_values(ascending=False)
print('5 rows with highest % of NaNs')
print('-------------------------------')
print(row_nans.head())
# calculate how many rows have NaNs
rows_w_NaNs = len(row_nans[row_nans > 0])
print('')
print('-------------------------------')
print('{} rows have at least one NaN'.format(rows_w_NaNs))
print('{:.2%} of rows contain at least one NaN, too much!'.format(
    rows_w_NaNs / len(cleaned)))
# how many NaNs as a percentage of the entire dataset?
total_NaN_perc = (
    cleaned.isnull().sum().sum() / (cleaned.shape[0] * cleaned.shape[1]))
print('')
print('-------------------------------')
print("""{:.2%} of all feature values are NaNs which is reasonable,
but we can\'t simply discard them because it would remove 1/5 of our data"""
      .format(total_NaN_perc))
print('')
print('-------------------------------')
print(
    """We need to remove all of the NaNs, scale our data on the removed NaN-removed
data, and then impute the NaNs on the full dataset""")
# fitting the scaler object on data with NaNs removed to not skew results
# (the scaler statistics come from complete rows only; imputation happens in
# the next cell)
cleaned_dropped_NaN = cleaned.dropna(
)  # defaults to dropping rows with at least one NaN
scaler = StandardScaler()  # initialise the scaler object
print(cleaned_dropped_NaN.shape)
display(scaler.fit(cleaned_dropped_NaN))  # fit to cleaned dropped NaN data
# imputing missing values using each column's respective mean
# NOTE: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; SimpleImputer is the drop-in replacement (its
# missing_values default is np.nan, matching the old behaviour).
from sklearn.impute import SimpleImputer
imputer_obj = SimpleImputer(strategy='mean')
imputed = imputer_obj.fit_transform(
    cleaned)  # fit to cleaned data with NaNs (no labels)
# RETURNS A NUMPY ARRAY
print('Imputed type: {}'.format(type(imputed)))
print('Imputed shape: {}'.format(imputed.shape))
# still need to transform the imputed data with the scaler that was fit above
imputed_scaled = scaler.transform(imputed)
print('Imputed_scaled type: {}'.format(type(imputed_scaled)))
print('Imputed_scaled shape: {}'.format(imputed_scaled.shape))
# convert scaled and imputed data back into a pandas DF
clean_scaled = pd.DataFrame(imputed_scaled, columns=cleaned.columns)
cleaned_scaled_NaNs = clean_scaled.isnull().sum().sum()
print('Scaled data shape: {}'.format(clean_scaled.shape))
print('-' * 40)
print('{} NaNs after imputation and scaling! :)'.format(cleaned_scaled_NaNs))
print('')
print('Let\'s look at how the data looks now')
print('-' * 40)
display(clean_scaled.describe())
def col_compare(df1, df2, col, title_1, title_2):
    '''
    Plot side-by-side count distributions of one column from two DataFrames.

    INPUT:
    df1: DataFrame with exactly same columns as other DF
    df2: DataFrame with exactly same columns as other DF
    col: str of column label wished to be compared
    title_1: str of title for first plot
    title_2: str of title for second plot
    OUTPUT:
    Two plots
    '''
    shared_palette = sns.cubehelix_palette(8, start=.5, rot=-.75)
    figure, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 8))
    figure.dpi = 85  # set resolution
    left_ax.set_title(title_1)
    right_ax.set_title(title_2)
    sns.countplot(x=col, data=df1, ax=left_ax, palette=shared_palette)
    sns.countplot(x=col, data=df2, ax=right_ax, palette=shared_palette)
    figure.tight_layout()
# before/after comparison for one representative column
col_compare(
    cleaned, clean_scaled, 'FINANZ_MINIMALIST',
    'Distribution of FINANZ_MINIMALIST Values Prior to Scaling & Imputation',
    'Distribution of FINANZ_MINIMALIST Values After Scaling & Imputation')
# DISTRIBUTION OF VALUES IS THE SAME BEFORE AND AFTER SCALING / IMPUTING
# The StandardScaler was fit on the NaN-free rows; imputing the column mean
# (zero after scaling) stabilises the data more than leaving NaNs.
# create PCA object
# fit PCA with all components retained; used for the scree plot below
pca_obj = PCA(random_state=41).fit(clean_scaled)
def scree(pca):
    '''
    Draw a scree plot for a fitted PCA: per-component explained-variance
    bars, the cumulative variance curve, and a total-variance annotation.

    INPUT:
    pca - fitted sklearn PCA object
    OUTPUT:
    None (renders a matplotlib figure)
    '''
    n_comps = len(pca.components_)  # number of components
    x_axis = np.arange(n_comps)  # axis for graph
    comp_vars = pca.explained_variance_ratio_  # variance values per component
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.set_title(
        'Variance Explained by Each Principal Component',
        fontdict={'fontsize': 20})
    ax.set_ylabel('Variance %')
    ax.set_xlabel('Component')
    # pass x/y as keywords: positional data arguments were removed from
    # seaborn.barplot in newer seaborn releases
    sns.barplot(
        x=x_axis,
        y=comp_vars,
        palette=sns.cubehelix_palette(8, start=.5, rot=-.75),
        ax=ax)
    # cumulative explained variance, overlaid as a line
    ax.plot(np.cumsum(comp_vars), marker='o', ms=6, c='black')
    ax.set_xticks(x_axis)
    total_var = np.cumsum(comp_vars[:n_comps])
    # label each bar with its variance percentage (string truncated to 4 chars)
    for i in range(n_comps):
        ax.annotate(
            r"%s%%" % ((str(comp_vars[i] * 100)[:4])),
            (x_axis[i] + 0.1, comp_vars[i] + .01),
            va="bottom",
            ha="center",
            fontsize=12)
    ax.annotate(
        'Total variance explained\nthrough {} components: {:.2%}'.format(
            n_comps, total_var[-1]),
        xy=(x_axis.mean(), total_var[int(n_comps / 2)]),
        xycoords='data',
        size=20,
        textcoords='offset pixels',
        xytext=(1, -100))
    fig.tight_layout()
scree(pca_obj)
# Re-apply PCA to the data while selecting for number of components to retain.
pca_obj_15 = PCA(
    n_components=15, random_state=41).fit(
        clean_scaled)  # create PCA object with specified component amount
# transform (not fit_transform): the model above is already fitted, and
# refitting the same data a second time was redundant work
PCA_df = pd.DataFrame(pca_obj_15.transform(clean_scaled))
# plot
scree(pca_obj_15)
print('PCA_df shape: {}'.format(PCA_df.shape))
print('First five rows of new DataFrame')
display(PCA_df.head())
print('Summary statistics of new DataFrame')
display(PCA_df.describe())
# Findings: 15 principal components are retained for the next step of the
# analysis (see the scree plots above for the variance they explain).
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
# HINT: Try defining a function here or in a new cell that you can reuse in the
# other cells.
def map_weight_feats(pca_obj, df, component):
    '''
    INPUT:
    pca_obj - Fit PCA object
    df - DataFrame with full data
    component - integer (starting at 0) of component
    OUTPUT:
    positive_comp - pandas series with positive weights
    negative_comp - pandas series with negative weights
    '''
    # table of component weights, one row per component, labeled by feature
    weight_table = pd.DataFrame(
        np.round(pca_obj.components_, 6), columns=df.keys())
    # pull the requested component's row and split it by sign,
    # strongest weights first in each direction
    row = weight_table.iloc[component]
    return (row[row > 0].sort_values(ascending=False),
            row[row < 0].sort_values(ascending=True))
pos_comp_0, neg_comp_0 = map_weight_feats(pca_obj_15, clean_scaled, 0)
# show the strongest positive and negative feature weights for component 0
banner = '*' * 40
for direction, weights in (('Positive', pos_comp_0), ('Negative', neg_comp_0)):
    print(banner)
    print('{} weights for component 0'.format(direction))
    print(banner)
    print(weights.head(10))
print(banner)
PLZ8_ANTG3: Number of 6-10 family buildings in the PLZ8 region. Among the other household sizes in the PLZ8 macro-cell features, it appears that there is a higher share of medium apartment complexes.PLZ8_ANTG4: Number of 10+ family buildings in the PLZ8 region. Among the other household sizes in the PLZ8 macro-cell features, it appears that there is a higher share of large multi-dwelling units.HH_EINKOMMEN_SCORE: Estimated household net income. Higher positive weight means higher proportion of lower income households.WEALTH: This is the variable we engineered from a multi-level variable. Houehold wealth (1-5, where 5 is the poorer end). Indicative of poorer households.EWDICHTE: Density of houehold per square kilometer, where 1 is less than 34 households per km^2 and 6 is more than 999 households per km^2. Indicative of more dense communitiesMOBI-REGIO: Movement patterns, (1: very high, 6: none). Low movementPLZ8_ANTG1: Number of 1-2 family houses in PLZ8 region (0: None, 4: very high share). High weight indicative of large proportion of 1-2 family homesKBA05_ANTG1: Number of 1-2 family houses in the microcell. (0: no 1-2 family homes, 4: very high share of 1-2 family homes)FINANZ_MINIMALIST: Financial typology for minimalist dimension. (1: very high, 5: very low). Indicative of low financial minimalistic mindsetKBA05_GBZ: Number of buildings in the microcell (1: 1-2 buildings, 5: >= 23 buildings) High number of buildings in the microcell# Map weights for the second principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pos_comp_1, neg_comp_1 = map_weight_feats(pca_obj_15, clean_scaled, 1)
# show the strongest positive and negative feature weights for component 1
banner = '*' * 40
for direction, weights in (('Positive', pos_comp_1), ('Negative', neg_comp_1)):
    print(banner)
    print('{} weights for component 1'.format(direction))
    print(banner)
    print(weights.head(10))
print(banner)
ALTERSKATEGORIE_GROB: Age (estimated) based on given name analysis (0: can't be determined, 4: > 60 years old, 9: uniformly distributed) Higher values mean older/evenly distributedFINANZ_VORSORGER: Financial preparation (1: very high, 5: very low) Low financial preparationSEMIO_ERL: Personality typology, event driven. (1: highest affinity, 7: lowest affinity, 9: unknown) Low event-driven affinitySEMIO_LUST: Personality typology,sensual-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Low sensual-minded affinityRETOURTYP_BK_S: Return type (1: influenceable Crazy-Shopper, 5: determined Minimal-Returner) Determined minimal returnerSEMIO_REL: Personality typology, religious affinity. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower religious affinityDECADE: Decade individual associates with their youth, engineered from multi-level variable. Generation on younger-endFINANZ_SPARER: Financial typology, money-saver. (1: very high, 5: very low) Low affiliation with saving moneyFINANZ_UNAUFFAELLIGER: Financial typology, inconspicuous. (1: very high, 5: very low) Low financial inconspicuousitySEMIO_TRADV: Personality typology, traditional-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Less traditional minded# Map weights for the third principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pos_comp_2, neg_comp_2 = map_weight_feats(pca_obj_15, clean_scaled, 2)
# show the strongest positive and negative feature weights for component 2
banner = '*' * 40
for direction, weights in (('Positive', pos_comp_2), ('Negative', neg_comp_2)):
    print(banner)
    print('{} weights for component 2'.format(direction))
    print(banner)
    print(weights.head(10))
print(banner)
SEMIO_VERT: Personality typology, dreamful. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower dreamful affinitySEMIO_SOZ: Personality typology, socially-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower socially-minded affinitySEMIO_FAM: Personality typology, family-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower family-minded affinitySEMIO_KULT: Personality typology, cultural-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower cultural-minded affinityFINANZ_MINIMALIST: Financial typology for minimalist dimension. (1: very high, 5: very low). Indicative of low financial minimalistic mindsetANREDE_KZ: Gender - (1: male, 2: female) Predominantly femaleSEMIO_KAEM: Personality typology, combatitive-attitude. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower combatitive-attitude affinitySEMIO_DOM: Personality typology, dominant-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower dominant-minded affinitySEMIO_KRIT: Personality typology, critical-minded. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower critical-minded affinitySEMIO_RAT: Personality typology, rational. (1: highest affinity, 7: lowest affinity, 9: unknown) Lower affinity with rationalityANREDE_KZ which means we can infer the positive traits belong to men as the negative indicates female gender. Specifically, that men are less dreamful, less socially-minded, less family-minded, less culturally-minded, and are not financial minimialists.¶ANREDE_KZ which means we can infer the positive traits belong to men as the negative indicates female gender. Specifically, that men are less dreamful, less socially-minded, less family-minded, less culturally-minded, and are not financial minimialists. Women, however, are less combatitive, less dominant-minded, less critical, and less rational¶def get_kmeans_score(data, num_clusters):
return np.abs(KMeans(n_clusters=num_clusters).fit(data).score(data))
# Sweep cluster counts 2..20, recording the (absolute) KMeans score for each,
# to locate an elbow. The original loop duplicated the timing/banner printing
# in both branches of an `if i == 2: ... continue / else:` construct (the
# `else` was unreachable dead weight after `continue`); this version prints
# the exact same output with the shared code written once.
results = {}
print('*' * 50)
num_centroids = range(2, 21)
glob_start = time.time()
for i in num_centroids:
    loop_start = time.time()
    results[i] = get_kmeans_score(clean_scaled, i)
    print('=> iteration {} of {}'.format(i - 1, len(num_centroids)))
    print('=> score with {} clusters: {:,.2f}'.format(i, results[i]))
    if i > num_centroids[0]:
        # relative SSE improvement over the previous cluster count
        delta = (results[i - 1] - results[i]) / results[i - 1]
        print('=> {:.2%} improvement from previous iteration'.format(delta))
    iteration_time = time.time() - loop_start
    print('=> iteration run time: {:.0f}m {:.0f}s '.format(
        iteration_time // 60, iteration_time % 60))
    print('*' * 50)
glob_end = time.time()
total_time = glob_end - glob_start
print('=> total run time: {:.0f}m {:.0f}s '.format(total_time // 60,
                                                   total_time % 60))
# overall improvement from the first to the last cluster count
# (single list() call instead of three repeated list(results.keys()) calls)
scores = list(results.values())
total_delta = (scores[0] - scores[-1]) / scores[0]
print('=> {:.2%} improvement from first iteration'.format(total_delta))
print('*' * 50)
# plot reduction of SSE with incrementally increasing number of centroids
centroids = list(num_centroids)
sse_values = list(results.values())
fig, ax = plt.subplots(figsize=(16, 8))
# line/marker styling gathered in one place
line_style = dict(
    ls='--', marker='o', lw=2.5, ms=10, color='gray', mfc='g', mec='w',
    mew=1.5)
ax.plot(centroids, sse_values, **line_style)
# call out the elbow on the curve
ax.annotate(
    'Elbow: 8 clusters',
    xy=(8, results[8]),
    xytext=(10, results[7]),
    arrowprops=dict(facecolor='gray', shrink=0.05, width=2),
    size=18)
ax.set_ylabel('Sum of Squared Error', fontdict={'fontsize': 18})
ax.set_xlabel('Number of Clusters', fontdict={'fontsize': 18})
ax.set_title(
    'Sum of Squared Error vs. Number of Clusters', fontdict={'fontsize': 25})
fig.tight_layout()
# The elbow in the SSE curve sits at 8 clusters, so refit with that count.
clusters = 8
# refit on the PCA-reduced data with a fixed seed for reproducibility
km = KMeans(n_clusters=clusters, random_state=41).fit(PCA_df)
print('Dimensions of clusters (clusters, components) {}'.format(
    km.cluster_centers_.shape))
print('Dimensions of labels {}'.format(km.labels_.shape))
# Load in the customer demographics data.
customers = pd.read_csv('Udacity_CUSTOMERS_Subset.csv', delimiter=';')
print('First five rows of customer data')
display(customers.head())
print('Descriptive Statistics')
display(customers.describe())
# run the customer data through the same cleaning pipeline as the population
customers_cleaned, customers_cleaned_highNaN = clean_data(customers)
# COMPARING CLEANED DATA RETURNED: spot-check one column's value range and
# the shape of each frame (high-NaN split, cleaned split, full-data control)
for header, frame in (('CUSTOMER HIGH NaN', customers_cleaned_highNaN),
                      ('CUSTOMER CLEANED', customers_cleaned),
                      ('CONTROL DATA (FULL DATA SET)', cleaned)):
    print(header)
    print(frame['ALTERSKATEGORIE_GROB'].sort_values().unique())
    print(frame.shape)
    print('--------------------------')
display(customers_cleaned.describe())
# Impute and scale the cleaned customer data with the SAME imputer and scaler
# objects that were fit on the population data, so both datasets live in the
# same feature space.
# fitting the scaler object on data with NaNs removed to not skew results
# NOTE(review): cust_cleaned_dropped_NaN is only printed below as a sanity
# check — nothing is actually re-fit on it here; imputer_obj and scaler are
# presumably fit in an earlier cell on the population data — confirm.
cust_cleaned_dropped_NaN = customers_cleaned.dropna(
    axis=0)  # defaults to dropping rows with at least one NaN
print(cust_cleaned_dropped_NaN.shape)  # fit to cleaned dropped NaN data
# imputing missing values using each column's respective mean
cust_imputed = imputer_obj.transform(
    customers_cleaned)  # fit to cleaned data with NaNs (no labels)
# RETURNS A NUMPY ARRAY
print('Customer Imputed type: {}'.format(type(cust_imputed)))
print('Customer Imputed shape: {}'.format(cust_imputed.shape))
# still need to transform the imputed data with the scaler that was fit above
cust_imputed_scaled = scaler.transform(cust_imputed)
print('Customer Imputed_scaled type: {}'.format(type(cust_imputed_scaled)))
print('Customer Imputed_scaled shape: {}'.format(cust_imputed_scaled.shape))
# convert scaled and imputed data back into a pandas DF, restoring the
# original column labels lost when the sklearn transformers emitted arrays
cust_clean_scaled = pd.DataFrame(
    cust_imputed_scaled, columns=customers_cleaned.columns)
# verify imputation removed every NaN
cust_cleaned_scaled_NaNs = cust_clean_scaled.isnull().sum().sum()
print('Scaled customer data shape: {}'.format(cust_clean_scaled.shape))
print('-' * 40)
print('{} NaNs after imputation and scaling! :)'.format(
    cust_cleaned_scaled_NaNs))
print('')
print('Let\'s look at how the data looks now')
print('-' * 40)
display(cust_clean_scaled.describe())
# project the scaled customer data into the PCA space fit on the population
# NOTE: despite the name, cust_pca_obj is a numpy array of projected rows,
# not a PCA object
cust_pca_obj = pca_obj_15.transform(cust_clean_scaled)
display(cust_pca_obj.shape)
cust_pca_df = pd.DataFrame(cust_pca_obj)
display(cust_pca_df.describe())
# assign each customer row to its nearest population-derived centroid
cust_kmeans = km.predict(cust_pca_obj)
display(pd.Series(cust_kmeans).value_counts())
display(cust_kmeans.shape)
# Build per-cluster frequency tables for both datasets. The original code
# rebuilt numpy arrays with Python list comprehensions over zip(); that is
# O(n) Python-level work for what np.column_stack does directly with the
# same resulting values and dtypes.
# GENERAL POPULATION
gen_centr_labels = km.labels_  # population dataset labels from kmeans object
# add back NaN rows from earlier as a cluster of their own with label -1
gen_centr_labels_NaN = np.append(gen_centr_labels, [-1] * test_high.shape[0])
# value counts for each centroid as a (n_clusters, 2) array of [label, count]
gen_labels_u, gen_counts = np.unique(gen_centr_labels_NaN, return_counts=True)
gen_centr_value_counts = np.column_stack([gen_labels_u, gen_counts])
# proportion of data associated with each centroid
gen_centr_prop = gen_counts / gen_counts.sum()
# combine into [label, count, proportion] rows
gen_centr_freq = np.column_stack([gen_labels_u, gen_counts, gen_centr_prop])
##############################################################################
# CUSTOMER DATA SET
# labels/predictions created above
cust_centr_labels = cust_kmeans
# add back NaN rows from earlier as a cluster of their own with label -1
cust_centr_labels_NaN = (np.append(cust_centr_labels,
                                   [-1] * customers_cleaned_highNaN.shape[0]))
# value counts for each centroid as a (n_clusters, 2) array of [label, count]
cust_labels_u, cust_counts = np.unique(
    cust_centr_labels_NaN, return_counts=True)
cust_centr_value_counts = np.column_stack([cust_labels_u, cust_counts])
# proportion of data associated with each centroid
cust_centr_prop = cust_counts / cust_counts.sum()
# combine into [label, count, proportion] rows
cust_centr_freq = np.column_stack(
    [cust_labels_u, cust_counts, cust_centr_prop])
# comparing the two datasets to see how evenly they are distributed
fig, axes = plt.subplots(1, 2, figsize=(14, 8), sharey=True)
# one panel per dataset: (title, frequency table, bar color, target axis)
panels = (('Population', gen_centr_freq, 'blue', axes[0]),
          ('Customers', cust_centr_freq, 'g', axes[1]))
for title, freq, shade, axis in panels:
    sns.barplot(x=freq[:, 0], y=freq[:, 2], color=shade, ax=axis)
    axis.set_title(title, fontdict={'fontsize': 20})
fig.tight_layout()
# Calculate the differences between the two datasets
diff_prop = cust_centr_freq[:, 2] - gen_centr_freq[:, 2]
pos_diff = pd.Series(diff_prop > 0)  # boolean array for conditional plotting
# plot the differences for visual inference
fig, ax = plt.subplots(figsize=(16, 8))
cluster_ids = cust_centr_freq[:, 0]
# green where customers are over-represented, red where under-represented
bar_colors = pos_diff.map({True: 'g', False: 'r'})
ax.bar(
    cluster_ids,
    diff_prop,
    0.50,
    alpha=0.85,
    color=bar_colors,
    edgecolor='w')
ax.set_xticks(cluster_ids)
ax.set_title(
    'Difference between Customer and Population clusters',
    fontdict={'fontsize': 25})
ax.set_xlabel('Cluster number', fontdict={'fontsize': 18})
ax.set_ylabel('Difference between proportions', fontdict={'fontsize': 18})
fig.tight_layout()
# attach each customer's predicted cluster so we can inspect clusters,
# then give the first three principal components the interpretive names
# derived from the weight analysis earlier
cust_pca_df['CENTROID'] = cust_kmeans
component_names = {
    0: 'FinancialStand-PopDensity',
    1: 'Gen-BuyingHabits-Age',
    2: 'Gender-Personality'
}
cust_pca_df.rename(columns=component_names, inplace=True)
# adapt earlier function to work with kmeans object
def map_weight_feats_kmeans(kmeans, df, centr):
'''
INPUT:
kmeans - Fit KMeans object
df - DataFrame with full data
centr - integer of centroid/cluster index
OUTPUT:
positive_comp - pandas series with positive weights
negative_comp - pandas series with negative weights
'''
# dataframe with cluster weights
w = pd.DataFrame(np.round(kmeans.cluster_centers_, 6), columns=df.keys())
# grab cluster of interest by index
clust = w.iloc[centr, :]
# sort weights into neg/pos dataframes
positive_clust = clust[clust > 0].sort_values(ascending=False)
negative_clust = clust[clust < 0].sort_values(ascending=True)
return positive_clust, negative_clust
# inspect the weights of the over-represented cluster (3) and the
# under-represented cluster (2)
pos_clust_over, neg_clust_over = map_weight_feats_kmeans(
    km, cust_pca_df.drop('CENTROID', axis=1), 3)
pos_clust_under, neg_clust_under = map_weight_feats_kmeans(
    km, cust_pca_df.drop('CENTROID', axis=1), 2)
rule = '*' * 40
for cluster_id, pos, neg in ((3, pos_clust_over, neg_clust_over),
                             (2, pos_clust_under, neg_clust_under)):
    for direction, weights in (('Positive', pos), ('Negative', neg)):
        print(rule)
        print('{} weights for cluster {}'.format(direction, cluster_id))
        print(rule)
        print(weights.head())
    print(rule)
# pairplot of different clusters and their values for the
# top three principal components
p = sns.pairplot(
    cust_pca_df,
    vars=[
        'FinancialStand-PopDensity', 'Gen-BuyingHabits-Age',
        'Gender-Personality'
    ],
    diag_kind='kde',  # density curves on the diagonal instead of histograms
    hue="CENTROID",  # color each point by its assigned cluster
    # NOTE(review): `size` was renamed `height` in seaborn 0.9 — confirm the
    # pinned seaborn version before upgrading
    size=6,
    palette=sns.hls_palette(8, l=.50, h=6 / 8, s=.70))
In the Difference between Customer and Population clusters graph, you can see that cluster three is clearly over-represented in the customer data when compared to the demographic data. Clusters 1, 2, and 6 are underrepresented, conversely (cluster 2 more so than the others). From our initial analysis of the first three principal components we surmised that the 0th component encapsulated the financial standing and population density of an individual. The 1st component encapsulated the individual's generation, their buying habits, and their age. The 2nd component captured their gender and their major personality traits.
In the above pairplot of the first three components, all points are segmented into their respective clusters/centroids. Cluster #3 (yellow) is the cluster we deemed overrepresented above. If you analyse across the horizontal axes by each principal component you will see that:
FinancialStand-PopDensity) are lower, the values for the 1st component (Gen-BuyingHabits-Age) are ambivalent/undiscernably positive/negative, and the 2nd component (Gender-Personality) has values well above zero. Gender-Personality is very high, as is FinancialStand-PopDensity. Remember, during the breakdown of the composition of the initial components, we observed that ANREDE_KZ which means we can infer the positive traits belong to men as the negative indicates female gender. Specifically, that men are less dreamful, less socially-minded, less family-minded, less culturally-minded, and are not financial minimialists"FinancialStand-PopDensity) are much higher than compared to cluster #3, and the values for the 1st component Gen-BuyingHabits-Age are very negative. We can observe, therefore, that the underrepresented cluster/group are: